Prerrequisites

library(tidyverse)
library(dslabs)
library(reshape2)
library(patchwork)
library(tidyquant)

Digits classification

Load the data.

mnist <- read_mnist()
glimpse(mnist)
List of 2
 $ train:List of 2
  ..$ images: int [1:60000, 1:784] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ labels: int [1:60000] 5 0 4 1 9 2 1 3 1 4 ...
 $ test :List of 2
  ..$ images: int [1:10000, 1:784] 0 0 0 0 0 0 0 0 0 0 ...
  ..$ labels: int [1:10000] 7 2 1 0 4 1 4 9 5 9 ...

Save the features into a different variable.

features <- mnist$train$images
glimpse(features)
 int [1:60000, 1:784] 0 0 0 0 0 0 0 0 0 0 ...

Converting our data to matrix.

d <- matrix(features[1,], nrow = 28, byrow = TRUE)
digito <- melt(d)
digito

We plot the resulting image.

digito %>% 
  ggplot(aes(x = Var2, y = Var1)) +
  geom_raster(aes(fill = factor(value))) +
  scale_y_reverse() + scale_fill_grey() +
  theme(legend.position = "none")

p <- function(i){
    d <- matrix(features[i,], nrow = 28, byrow = TRUE)
    digito<-melt(d)
    ggplot(digito, aes(x = Var2, y = Var1)) +
    geom_raster(aes(fill=value)) +  coord_fixed() +
      theme_void() + theme(legend.position = "none") +
      scale_y_reverse()
  }
p(1) + p(2) + p(3) + p(4) + p(5) + p(6)

We get the total occurences for each digit.

etiquetas <- mnist$train$labels 
etiquetas %>%
  as_tibble() %>% 
  mutate(cant = 1) %>%
  group_by(value) %>%
    summarise(total = sum(cant))
`summarise()` ungrouping output (override with `.groups` argument)
d %>% 
  as_tibble()
features2 <- features %>% 
  as_tibble() %>% 
  mutate(digito = etiquetas,
         n = 1:60000) %>% 
  select(n, digito, everything())
features2
features_long <- features2 %>% 
  pivot_longer(cols = -c(n, digito)) %>% 
  mutate(Var1 = rep(1:28, times = 28 * 60000),
         Var2 = rep(1:28, each = 28, times = 60000)) %>%
  select(-name)
features_long

We create a function that selects a subset of the data and plots the result.

For example, taking the first 12 rows.

p <- function(i){
  features_long %>% 
    filter(n %in% i) %>%
    ggplot(aes(x = Var1, y = Var2)) +
    geom_raster(aes(fill = factor(value))) +
    scale_y_reverse() + scale_fill_grey() +
    theme(legend.position = "none") + 
    facet_wrap(~ n)
}

p(1:12)

tibble(Var1 = 3,
                  Var2 = 6,
                  lab = features_long %>% group_by(n) %>% distinct(digito) %>% pull(digito)) %>% 
  slice(2:4)

Getting now the second 12 rows:

mnist_clustering <- kmeans(features, centers = 10, nstart = 10)
did not converge in 10 iterationsQuick-TRANSfer stage steps exceeded maximum (= 3000000)Quick-TRANSfer stage steps exceeded maximum (= 3000000)Quick-TRANSfer stage steps exceeded maximum (= 3000000)
glimpse(mnist_centers)
 num [1:10, 1:784] 0 0 0 0 0 0 0 0 0 0 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:10] "1" "2" "3" "4" ...
  ..$ : NULL
LS0tDQp0aXRsZTogIkJhc2ljIGNsYXNzaWZpY2F0aW9uIGV4YW1wbGVzIg0KYXV0aG9yOiAiUGFibG8gQmVuYXZpZGVzLUhlcnJlcmEiDQpkYXRlOiAyMDIwLTA2LTI3DQpvdXRwdXQ6IA0KICBodG1sX25vdGVib29rOg0KICAgIHRvYzogVFJVRQ0KICAgIHRvY19mbG9hdDogVFJVRQ0KICAgIHRoZW1lOiBzcGFjZWxhYg0KICAgIGhpZ2hsaWdodDogdGFuZ28NCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChtZXNzYWdlID0gVFJVRSwgd2FybmluZyA9IFRSVUUpDQpgYGANCg0KIyBQcmVycmVxdWlzaXRlcw0KDQpgYGB7ciBwa2dzLCBtZXNzYWdlPUZBTFNFfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KGRzbGFicykNCmxpYnJhcnkocmVzaGFwZTIpDQpsaWJyYXJ5KHBhdGNod29yaykNCmxpYnJhcnkodGlkeXF1YW50KQ0KYGBgDQoNCiMgRGlnaXRzIGNsYXNzaWZpY2F0aW9uDQoNCkxvYWQgdGhlIGRhdGEuDQoNCmBgYHtyIGRpZ2l0c30NCm1uaXN0IDwtIHJlYWRfbW5pc3QoKQ0KZ2xpbXBzZShtbmlzdCkNCmBgYA0KDQpTYXZlIHRoZSBmZWF0dXJlcyBpbnRvIGEgZGlmZmVyZW50IHZhcmlhYmxlLg0KDQpgYGB7cn0NCmZlYXR1cmVzIDwtIG1uaXN0JHRyYWluJGltYWdlcw0KZ2xpbXBzZShmZWF0dXJlcykNCmBgYA0KDQpDb252ZXJ0aW5nIG91ciBkYXRhIHRvIG1hdHJpeC4NCg0KYGBge3IgZGlnaXQgbWF0cml4fQ0KZCA8LSBtYXRyaXgoZmVhdHVyZXNbMSxdLCBucm93ID0gMjgsIGJ5cm93ID0gVFJVRSkNCmRpZ2l0byA8LSBtZWx0KGQpDQpkaWdpdG8NCmBgYA0KDQpXZSBwbG90IHRoZSByZXN1bHRpbmcgaW1hZ2UuDQoNCmBgYHtyIGRpZ2l0IHBsb3R9DQpkaWdpdG8gJT4lIA0KICBnZ3Bsb3QoYWVzKHggPSBWYXIyLCB5ID0gVmFyMSkpICsNCiAgZ2VvbV9yYXN0ZXIoYWVzKGZpbGwgPSBmYWN0b3IodmFsdWUpKSkgKw0KICBzY2FsZV95X3JldmVyc2UoKSArIHNjYWxlX2ZpbGxfZ3JleSgpICsNCiAgdGhlbWUobGVnZW5kLnBvc2l0aW9uID0gIm5vbmUiKQ0KYGBgDQoNCmBgYHtyIGRpZ2l0IHBsb3QgZnVuY3Rpb259DQpwIDwtIGZ1bmN0aW9uKGkpew0KICAgIGQgPC0gbWF0cml4KGZlYXR1cmVzW2ksXSwgbnJvdyA9IDI4LCBieXJvdyA9IFRSVUUpDQogICAgZGlnaXRvPC1tZWx0KGQpDQogICAgZ2dwbG90KGRpZ2l0bywgYWVzKHggPSBWYXIyLCB5ID0gVmFyMSkpICsNCiAgICBnZW9tX3Jhc3RlcihhZXMoZmlsbD12YWx1ZSkpICsgIGNvb3JkX2ZpeGVkKCkgKw0KICAgICAgdGhlbWVfdm9pZCgpICsgdGhlbWUobGVnZW5kLnBvc2l0aW9uID0gIm5vbmUiKSArDQogICAgICBzY2FsZV95X3JldmVyc2UoKQ0KICB9DQpgYGANCg0KYGBge3IgZGlnaXQgcGxvdCBleGFtcGxlfQ0KcCgxKSArIHAoMikgKyBwKDMpICsgcCg0KSArIHAoNSkgKyBwKDYpDQpgYGANCg0KV2UgZ2V0IHRoZSB0b3RhbCBvY2N1cmVuY2VzIGZvciBlYWNoIGRpZ2l0Lg0KDQpgYGB7ciBkaWdpdCBsYWJlbCBzdW1tYXJ5fQ0KZXRpcXVldGFzIDwtIG1uaXN0JHRyYWluJGxhYmVscyANCmV0aXF1ZXRhcyAlPiUNCiAgYXNfdGliYmxlKCkgJT4lIA0KICBtdXRhdGUoY2FudCA9IDEpICU+JQ0KICBncm91cF9ieSh2YWx1ZSkgJT4lDQogICAgc3VtbWFyaXNlKHRvdGFsID0gc3VtKGNhbnQpKQ0KYGBgDQoNCmBgYHtyfQ0KZCAlPiUgDQogIGFzX3RpYmJsZSgpDQpgYGANCg0KDQpgYGB7cn0NCmZlYXR1cmVzMiA8LSBmZWF0dXJlcyAlPiUgDQogIGFzX3RpYmJsZSgpICU+JSANCiAgbXV0YXRlKGRpZ2l0byA9IGV0aXF1ZXRhcywNCiAgICAgICAgIG4gPSAxOjYwMDAwKSAlPiUgDQogIHNlbGVjdChuLCBkaWdpdG8sIGV2ZXJ5dGhpbmcoKSkNCmZlYXR1cmVzMg0KYGBgDQoNCg0KDQpgYGB7cn0NCmZlYXR1cmVzX2xvbmcgPC0gZmVhdHVyZXMyICU+JSANCiAgcGl2b3RfbG9uZ2VyKGNvbHMgPSAtYyhuLCBkaWdpdG8pKSAlPiUgDQogIG11dGF0ZShWYXIxID0gcmVwKDE6MjgsIHRpbWVzID0gMjggKiA2MDAwMCksDQogICAgICAgICBWYXIyID0gcmVwKDE6MjgsIGVhY2ggPSAyOCwgdGltZXMgPSA2MDAwMCkpICU+JQ0KICBzZWxlY3QoLW5hbWUpDQpmZWF0dXJlc19sb25nDQpgYGANCg0KV2UgY3JlYXRlIGEgZnVuY3Rpb24gdGhhdCBzZWxlY3RzIGEgc3Vic2V0IG9mIHRoZSBkYXRhIGFuZCBwbG90cyB0aGUgcmVzdWx0Lg0KDQpGb3IgZXhhbXBsZSwgdGFraW5nIHRoZSBmaXJzdCAxMiByb3dzLg0KDQpgYGB7cn0NCnAgPC0gZnVuY3Rpb24oaSl7DQogIGZlYXR1cmVzX2xvbmcgJT4lIA0KICAgIGZpbHRlcihuICVpbiUgaSkgJT4lDQogICAgZ2dwbG90KGFlcyh4ID0gVmFyMSwgeSA9IFZhcjIpKSArDQogICAgZ2VvbV9yYXN0ZXIoYWVzKGZpbGwgPSBmYWN0b3IodmFsdWUpKSkgKw0KICAgIHNjYWxlX3lfcmV2ZXJzZSgpICsgc2NhbGVfZmlsbF9ncmV5KCkgKw0KICAgIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICJub25lIikgKyANCiAgICBmYWNldF93cmFwKH4gbikNCn0NCg0KcCgxOjEyKQ0KYGBgDQoNCmBgYHtyfQ0KdGliYmxlKFZhcjEgPSAzLA0KICAgICAgICAgICAgICAgICAgVmFyMiA9IDYsDQogICAgICAgICAgICAgICAgICBsYWIgPSBmZWF0dXJlc19sb25nICU+JSBncm91cF9ieShuKSAlPiUgZGlzdGluY3QoZGlnaXRvKSAlPiUgcHVsbChkaWdpdG8pKSAlPiUgDQogIHNsaWNlKDI6NCkNCmBgYA0KDQoNCkdldHRpbmcgbm93IHRoZSBzZWNvbmQgMTIgcm93czoNCg0KYGBge3J9DQpwKDEzOjI0KQ0KYGBgDQoNCg0KDQoNCg0KDQpgYGB7cn0NCm1uaXN0X2NsdXN0ZXJpbmcgPC0ga21lYW5zKGZlYXR1cmVzLCBjZW50ZXJzID0gMTAsIG5zdGFydCA9IDEwKQ0KYGBgDQoNCmBgYHtyfQ0KbW5pc3RfY2VudGVycyA8LSBtbmlzdF9jbHVzdGVyaW5nJGNlbnRlcnMNCmdsaW1wc2UobW5pc3RfY2VudGVycykNCmBgYA0KDQoNCg0KDQo=